# Data reading and visualization
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
# Statistical analysis
from scipy.stats import norm
# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# XGBoost & LightGBM
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Remove unwanted warnings from outputs
import warnings
warnings.filterwarnings('ignore')
# Set themes
sns.set_style('darkgrid')
# 'seaborn-bright' was renamed to 'seaborn-v0_8-bright' in matplotlib 3.6 and
# removed later; fall back so the notebook keeps working on newer matplotlib.
try:
    plt.style.use('seaborn-bright')
except OSError:
    plt.style.use('seaborn-v0_8-bright')
# PDF Rendering
import plotly.io as pio
pio.renderers.default = "notebook+pdf"
# Root directory containing the `dataset/` folder
BASE_PATH = "./"
# RandomForestRegressor hyperparameters (very shallow trees — max_depth=2 keeps variance low)
rf_params = {
'n_estimators': 100,
'max_depth': 2,
'min_samples_split': 2,
'min_samples_leaf': 1
}
# XGBRegressor hyperparameters (many slow-learning trees with row/column subsampling)
xgb_params = {
'n_estimators': 500,
'max_depth': 3,
'min_child_weight': 2,
'learning_rate': 0.01,
'subsample': 0.8,
'colsample_bytree': 0.8,
'booster': 'gbtree'
}
# LGBMRegressor hyperparameters (mirrors the XGBoost settings for a fair comparison)
lgb_params = {
'n_estimators': 500,
'max_depth': 3,
'learning_rate': 0.01,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'regression'
}
# Print dataset usage stats
def print_info(df):
    """Print the shape, columns, dtype counts and memory footprint of *df*."""
    mem_mb = round(df.memory_usage().sum() / 1024**2, 2)
    print(f"\nDataframe Shape: {df.shape}")
    print(f"\nDataframe Columns: {df.columns}")
    print(f"\nDataframe dtypes: \n{df.dtypes.value_counts()}")
    print(
        f"\nDataframe memory usage: {mem_mb} MB")
# Visualize missing data
def visualize_missing_data(df):
    """Bar-plot the percentage of missing values for every column of *df*
    that has at least one missing value (dark-themed plotly chart)."""
    missing_pct = (df.isnull().sum() / len(df)) * 100
    missing_pct = missing_pct.drop(missing_pct[missing_pct == 0].index).sort_values()
    # BUG FIX: the original called Series.rename({'index': 'Feature', 0: 'Missing (%)'}),
    # which relabels *index entries* named 'index'/0 — a no-op here. The axis titles
    # are set explicitly below, so that dead call is removed.
    fig = px.bar(x=missing_pct.index, y=missing_pct,
                 title='Missing Data by Feature', template='plotly_dark')
    fig.update_xaxes(title_text="Feature")
    fig.update_yaxes(title_text="Missing (%)")
    py.iplot(fig)
# Plot Histogram of dataset
def plot_histogram(df, distline=True):
    """Plot a histogram for every column of *df*.

    With ``distline=True`` each column gets its own subplot with a fitted
    normal curve overlaid; otherwise ``DataFrame.hist`` draws plain
    50-bin histograms.
    """
    import math
    if distline:
        # BUG FIX: the grid was hard-coded to 4x4, which raised for
        # dataframes with more than 16 columns. Size the grid dynamically.
        n_cols = 4
        n_rows = math.ceil(len(df.columns) / n_cols)
        plt.figure(figsize=(15, 15))
        for i, column in enumerate(df.columns):
            plt.subplot(n_rows, n_cols, i + 1)
            plt.title(column)
            plt.xlabel(column)
            # NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
            # kept for output parity — migrate to displot/histplot when upgrading.
            sns.distplot(df[column], fit=norm)
        plt.tight_layout()
        plt.show()
    else:
        # `df.hist` creates its own figure; the original also built an unused
        # extra figure here, which showed up as an empty "<Figure ...>" output.
        df.hist(figsize=(15, 15), bins=50)
        plt.tight_layout()
        plt.show()
    # BUG FIX: the original ended with an extra tight_layout()/show() pair that
    # operated on no active figure and emitted a blank figure artifact.
# Fit data to model(s)
def fit_robust_pipeline(model, X_train, y_train, X_test, y_test):
    """Fit *model* behind a RobustScaler and return its test R^2, rounded to 2 dp."""
    steps = [('scaler', RobustScaler()), ('model', model)]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    return round(pipeline.score(X_test, y_test), 2)
# Scaling Data
def scale_data(scaler, df, feats_to_transform):
    """Return a copy of *df* with *feats_to_transform* replaced by the
    output of ``scaler.fit_transform``; the input frame is left untouched."""
    out = df.copy()
    raw_values = out[feats_to_transform].values
    out[feats_to_transform] = scaler.fit_transform(raw_values)
    return out
# Encode Data
def encode_data(df, encoder_type, feats_to_encode):
    """Encode categorical features of *df* and return a new dataframe.

    encoder_type: 'Label' (integer codes per category) or 'OneHot'
    (one 0/1 indicator column per category value). Any other value
    prints an error and returns None.

    BUG FIX: the original body ignored its own parameters and read the
    module-level globals `cat_feats`, `label_enc_df` and `onehot_enc_df`,
    so it raised NameError when used outside this notebook's exact state.
    It now operates on the arguments it is given.
    """
    if encoder_type == 'Label':
        encoded = df.copy()
        for feat in feats_to_encode:
            label_enc = LabelEncoder()
            encoded[feat] = label_enc.fit_transform(encoded[feat])
        return encoded
    elif encoder_type == 'OneHot':
        encoded = df.copy()
        for feat in feats_to_encode:
            onehot_enc = OneHotEncoder(handle_unknown='ignore')
            transformed = pd.DataFrame(
                onehot_enc.fit_transform(encoded[feat].values.reshape(-1, 1)).toarray())
            transformed.columns = [f"{feat}_{i}" for i in transformed.columns]
            encoded = encoded.join(transformed)
            encoded.drop([feat], axis=1, inplace=True)
        return encoded
    else:
        print("Invalid Encoder Type")
# Fit data to model(s)
def evaluate_performance(X, Y, test_size=0.2, scale_data=False, scaler=None, feats_to_transform=None):
    """Train RF/XGB/LGBM regressors on (X, Y), print their test R^2 scores
    and return the three (unfitted-parameter) model objects.

    Optionally scales *feats_to_transform* with *scaler* first when
    ``scale_data=True``.
    """
    # Define models from the module-level hyperparameter dicts
    rf = RandomForestRegressor(**rf_params)
    xgb = XGBRegressor(**xgb_params)
    lgb = LGBMRegressor(**lgb_params)
    # Transform data
    if scale_data:
        # BUG FIX: the `scale_data` parameter shadows the module-level
        # scale_data() helper, so calling it directly raised
        # "TypeError: 'bool' object is not callable". The parameter name is
        # kept for backward compatibility; fetch the helper via globals().
        X = globals()['scale_data'](scaler, X, feats_to_transform)
    # Split data (note: random split — scores vary between runs)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
    # Pass models to pipeline
    rf_score = fit_robust_pipeline(rf, X_train, y_train, X_test, y_test)
    xgb_score = fit_robust_pipeline(xgb, X_train, y_train, X_test, y_test)
    lgb_score = fit_robust_pipeline(lgb, X_train, y_train, X_test, y_test)
    # Print scores
    print(f"Model Scores: \n{'-'*25}\n")
    print(f"RandomForestRegressor Score: {rf_score}")
    print(f"XGBRegressor Score: {xgb_score}")
    print(f"LGBMRegressor Score: {lgb_score}")
    return rf, xgb, lgb
# Plot feature importance
def plot_feature_importance(features, title, model):
    """Horizontal bar chart of a fitted model's feature importances."""
    importances = model.feature_importances_
    fig = px.bar(x=importances, y=features, template='plotly_dark')
    fig.update_layout(title=f"{title}")
    fig.update_xaxes(title_text="Feature Importance")
    fig.update_yaxes(title_text="Feature")
    py.iplot(fig)
# Read data
# NOTE(review): assumes the training CSV lives at ./dataset/train.csv — confirm layout
df = pd.read_csv(os.path.join(BASE_PATH, "dataset/train.csv"))
# Check shape of dataset
print_info(df)
Dataframe Shape: (1460, 81)
Dataframe Columns: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
Dataframe dtypes:
object 43
int64 35
float64 3
dtype: int64
Dataframe memory usage: 0.9 MB
# Tabular representation of statistics (count/mean/std/quartiles per numeric column)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Id | 1460.0 | 730.500000 | 421.610009 | 1.0 | 365.75 | 730.5 | 1095.25 | 1460.0 |
| MSSubClass | 1460.0 | 56.897260 | 42.300571 | 20.0 | 20.00 | 50.0 | 70.00 | 190.0 |
| LotFrontage | 1201.0 | 70.049958 | 24.284752 | 21.0 | 59.00 | 69.0 | 80.00 | 313.0 |
| LotArea | 1460.0 | 10516.828082 | 9981.264932 | 1300.0 | 7553.50 | 9478.5 | 11601.50 | 215245.0 |
| OverallQual | 1460.0 | 6.099315 | 1.382997 | 1.0 | 5.00 | 6.0 | 7.00 | 10.0 |
| OverallCond | 1460.0 | 5.575342 | 1.112799 | 1.0 | 5.00 | 5.0 | 6.00 | 9.0 |
| YearBuilt | 1460.0 | 1971.267808 | 30.202904 | 1872.0 | 1954.00 | 1973.0 | 2000.00 | 2010.0 |
| YearRemodAdd | 1460.0 | 1984.865753 | 20.645407 | 1950.0 | 1967.00 | 1994.0 | 2004.00 | 2010.0 |
| MasVnrArea | 1452.0 | 103.685262 | 181.066207 | 0.0 | 0.00 | 0.0 | 166.00 | 1600.0 |
| BsmtFinSF1 | 1460.0 | 443.639726 | 456.098091 | 0.0 | 0.00 | 383.5 | 712.25 | 5644.0 |
| BsmtFinSF2 | 1460.0 | 46.549315 | 161.319273 | 0.0 | 0.00 | 0.0 | 0.00 | 1474.0 |
| BsmtUnfSF | 1460.0 | 567.240411 | 441.866955 | 0.0 | 223.00 | 477.5 | 808.00 | 2336.0 |
| TotalBsmtSF | 1460.0 | 1057.429452 | 438.705324 | 0.0 | 795.75 | 991.5 | 1298.25 | 6110.0 |
| 1stFlrSF | 1460.0 | 1162.626712 | 386.587738 | 334.0 | 882.00 | 1087.0 | 1391.25 | 4692.0 |
| 2ndFlrSF | 1460.0 | 346.992466 | 436.528436 | 0.0 | 0.00 | 0.0 | 728.00 | 2065.0 |
| LowQualFinSF | 1460.0 | 5.844521 | 48.623081 | 0.0 | 0.00 | 0.0 | 0.00 | 572.0 |
| GrLivArea | 1460.0 | 1515.463699 | 525.480383 | 334.0 | 1129.50 | 1464.0 | 1776.75 | 5642.0 |
| BsmtFullBath | 1460.0 | 0.425342 | 0.518911 | 0.0 | 0.00 | 0.0 | 1.00 | 3.0 |
| BsmtHalfBath | 1460.0 | 0.057534 | 0.238753 | 0.0 | 0.00 | 0.0 | 0.00 | 2.0 |
| FullBath | 1460.0 | 1.565068 | 0.550916 | 0.0 | 1.00 | 2.0 | 2.00 | 3.0 |
| HalfBath | 1460.0 | 0.382877 | 0.502885 | 0.0 | 0.00 | 0.0 | 1.00 | 2.0 |
| BedroomAbvGr | 1460.0 | 2.866438 | 0.815778 | 0.0 | 2.00 | 3.0 | 3.00 | 8.0 |
| KitchenAbvGr | 1460.0 | 1.046575 | 0.220338 | 0.0 | 1.00 | 1.0 | 1.00 | 3.0 |
| TotRmsAbvGrd | 1460.0 | 6.517808 | 1.625393 | 2.0 | 5.00 | 6.0 | 7.00 | 14.0 |
| Fireplaces | 1460.0 | 0.613014 | 0.644666 | 0.0 | 0.00 | 1.0 | 1.00 | 3.0 |
| GarageYrBlt | 1379.0 | 1978.506164 | 24.689725 | 1900.0 | 1961.00 | 1980.0 | 2002.00 | 2010.0 |
| GarageCars | 1460.0 | 1.767123 | 0.747315 | 0.0 | 1.00 | 2.0 | 2.00 | 4.0 |
| GarageArea | 1460.0 | 472.980137 | 213.804841 | 0.0 | 334.50 | 480.0 | 576.00 | 1418.0 |
| WoodDeckSF | 1460.0 | 94.244521 | 125.338794 | 0.0 | 0.00 | 0.0 | 168.00 | 857.0 |
| OpenPorchSF | 1460.0 | 46.660274 | 66.256028 | 0.0 | 0.00 | 25.0 | 68.00 | 547.0 |
| EnclosedPorch | 1460.0 | 21.954110 | 61.119149 | 0.0 | 0.00 | 0.0 | 0.00 | 552.0 |
| 3SsnPorch | 1460.0 | 3.409589 | 29.317331 | 0.0 | 0.00 | 0.0 | 0.00 | 508.0 |
| ScreenPorch | 1460.0 | 15.060959 | 55.757415 | 0.0 | 0.00 | 0.0 | 0.00 | 480.0 |
| PoolArea | 1460.0 | 2.758904 | 40.177307 | 0.0 | 0.00 | 0.0 | 0.00 | 738.0 |
| MiscVal | 1460.0 | 43.489041 | 496.123024 | 0.0 | 0.00 | 0.0 | 0.00 | 15500.0 |
| MoSold | 1460.0 | 6.321918 | 2.703626 | 1.0 | 5.00 | 6.0 | 8.00 | 12.0 |
| YrSold | 1460.0 | 2007.815753 | 1.328095 | 2006.0 | 2007.00 | 2008.0 | 2009.00 | 2010.0 |
| SalePrice | 1460.0 | 180921.195890 | 79442.502883 | 34900.0 | 129975.00 | 163000.0 | 214000.00 | 755000.0 |
# Check for null values
visualize_missing_data(df)
# Filter and keep only the numerical data
df_num = df.select_dtypes(include=['float64', 'int64'])
# Filter out more categorical and time-series data given in `data_description.txt`
to_drop = ['Id', 'MSSubClass', 'OverallQual', 'OverallCond', 'YearRemodAdd','MoSold', 'YrSold', 'FullBath', 'HalfBath', 'Fireplaces', 'GarageCars']
df_num.drop(to_drop, axis=1, inplace=True)
# Even more filtering to remove miscellaneous features
to_drop = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'MiscVal', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
df_num.drop(to_drop, axis=1, inplace=True)
# Check shape of dataset
print_info(df_num)
Dataframe Shape: (1460, 15)
Dataframe Columns: Index(['LotFrontage', 'LotArea', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF',
'2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', 'SalePrice'],
dtype='object')
Dataframe dtypes:
int64 14
float64 1
dtype: int64
Dataframe memory usage: 0.17 MB
# Analyse how living area affects the saleprice, using a color gradient and sized scatter-dots
# The plotly library allows us to interact with our plots, feel free to hover over the data to see popups with guiding information
# You can also cut out sections of the plot to analyze only those ranges you wish to
fig = px.scatter(df, 'GrLivArea', 'SalePrice', size='SalePrice', color='FullBath', template='plotly_dark')
py.iplot(fig)
# Analyze distribution of sale price
# Again, plotly gives us the ability to interact with our plots.
# Hover over the box plots to get an easier view of the plot-data
fig = px.histogram(df, 'SalePrice', color='GarageCars', marginal='box', template='plotly_dark', height=700)
py.iplot(fig)
# Observe each feature's distribution
# The blue line is the data's distribution
# The black line is what the 'ideal' or 'normal' distribution for the data would be, if the data is somehow manipulated
plot_histogram(df_num, distline=True)
<Figure size 432x288 with 0 Axes>
# There are a couple of features which remain constant. Let's remove them.
df_num.drop(['LowQualFinSF', 'PoolArea'], axis=1, inplace=True)
# Use pairplots to see how one feature is related to the other
features = list(df_num.columns)
features.remove('SalePrice')
# Since these can get very large, let's use only the SalePrice and the first 4 features
# NOTE(review): sns.pairplot creates its own figure, so the plt.figure below is
# unused and accounts for the empty "<Figure ...>" artifact in the output.
fig = plt.figure(figsize=(25, 10))
viz_df = pd.concat([df_num['SalePrice'], df_num[features[:4]]], axis=1)
# sns.pairplot(viz_df, diag_kind='kde', hue='SalePrice', height=2.5)
sns.pairplot(viz_df, diag_kind='kde', height=2.5)
plt.show()
<Figure size 1800x720 with 0 Axes>
# Since these can get very large, let's use only the SalePrice and the second 4 features
# NOTE(review): sns.pairplot creates its own figure; the plt.figure below is unused.
fig = plt.figure(figsize=(25, 10))
viz_df = pd.concat([df_num['SalePrice'], df_num[features[4:8]]], axis=1)
# sns.pairplot(viz_df, diag_kind='kde', hue='SalePrice', height=2.5)
sns.pairplot(viz_df, diag_kind='kde', height=2.5)
plt.show()
<Figure size 1800x720 with 0 Axes>
# Since these can get very large, let's use only the SalePrice and the remaining features
fig = plt.figure(figsize=(25, 10))
# BUG FIX: the original sliced features[9:], silently skipping features[8]
# ('BsmtHalfBath') — the earlier cells covered indices 0-3 and 4-7, so the
# tail must start at index 8 to plot every feature.
viz_df = pd.concat([df_num['SalePrice'], df_num[features[8:]]], axis=1)
# sns.pairplot(viz_df, diag_kind='kde', hue='SalePrice', height=2.5)
sns.pairplot(viz_df, diag_kind='kde', height=2.5)
plt.show()
<Figure size 1800x720 with 0 Axes>
# Correlation Matrix — Pearson correlation between every pair of numeric features
fig = plt.figure(figsize=(12, 10))
sns.heatmap(df_num.corr(), cmap='Greens', annot=True)
plt.show()
If pairplots appear challenging, you can use a correlation heatmap (although you will be unable to identify outliers) to see to what extent one variable affects another.
Since our target variable is SalePrice, we should be concerned about which features seem to be the most impactful.
Later, we will plot the feature importances — the features that our model found most helpful — and test our hypothesis.
# Now that we are done with cleaning the data, let's try to generate more features
# by using the existing ones.
pre_gen_feats = df_num.columns.values.tolist()
print(pre_gen_feats)
# For example, we can combine the 'LotArea', 'TotalBsmtSF', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF' features to generate a new feature: 'ExtArea'
df_num['ExtArea'] = df_num['LotArea'] + df_num['TotalBsmtSF'] + df_num['GarageArea'] + df_num['WoodDeckSF'] + df_num['OpenPorchSF']
# Another example: combine all bathrooms in basement (half baths weighted 0.5)
df_num['AllBsmtBaths'] = df_num['BsmtFullBath'] + 0.5*df_num['BsmtHalfBath']
['LotFrontage', 'LotArea', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice']
# We now need to make sure that the data has a normal distribution.
# To achieve this, we can use (among many options):
# - StandardScaler: Regular normalization, where transformed_x = (x-column_mean)/column_std_deviation
# - MinMaxScaler: Scales data between 0 and 1 (by default, but can be scaled between any two values of your choice)
# First, let's convert the categorical features to strings
# (so that select_dtypes('object') in the encoding cell below picks them up)
df_num['BsmtFullBath'] = df_num['BsmtFullBath'].astype('str')
df_num['BsmtHalfBath'] = df_num['BsmtHalfBath'].astype('str')
df_num['AllBsmtBaths'] = df_num['AllBsmtBaths'].astype('str')
# Next, let's work with the missing values
visualize_missing_data(df_num)
# Since LotFrontage has a lot of missing values, we can:
# 1. Use `impute`:
#    - impute max/min value into missing areas
#    - impute mean/median/mode value into missing areas
# 2. drop the rows with the missing values
# 3. drop the whole column
# We will use option 1 as our dataset is relatively small, and losing data will not be beneficial
# Impute the mean value of the column in the missing value areas.
# BUG FIX: `df['col'].fillna(..., inplace=True)` is chained assignment — it may
# operate on a temporary copy and is deprecated in pandas 2.x (a no-op under
# copy-on-write). Assign the result back to the column instead.
df_num['LotFrontage'] = df_num['LotFrontage'].fillna(df_num['LotFrontage'].mean())
# Next, we deal with outliers
# For this, we go back to the pairplots!
# In GrLivArea vs SalePrice --> we have outliers
# It doesn't make sense to have a house of nearly 6000sqft listed for a mere 200k dollars
# Although modern gradient boosting machines are built to automatically tackle these problems, models like LinearRegression are highly prone to reduced accuracy due to outliers (Unless we use different scalers such as RobustScaler and so on...)
# NOTE(review): the SalePrice < 300000 cut removes *all* expensive houses, not just
# the mispriced large ones — confirm this aggressive filtering is intended.
df_num = df_num.loc[(df_num['GrLivArea'] < 4000) & (df_num['SalePrice'] < 300000)]
# There are several more outliers in the dataset, but we will not be going through them.
# Encode the Categorical Data
# (the bath-count columns were cast to 'object' above, so they are selected here)
cat_feats = df_num.select_dtypes('object').columns.values.tolist()
label_enc_df, onehot_enc_df = df_num.copy(), df_num.copy()
# LabelEncoding: map each category value to an integer code
for feat in cat_feats:
    label_enc = LabelEncoder()
    label_enc_df[feat] = label_enc.fit_transform(label_enc_df[feat])
# OneHotEncoding: one 0/1 indicator column per category value, original column dropped
# NOTE(review): this duplicates the body of encode_data() above — consider reusing it.
for feat in cat_feats:
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    transformed = pd.DataFrame(onehot_enc.fit_transform(onehot_enc_df[feat].values.reshape(-1, 1)).toarray())
    transformed.columns = [f"{feat}_{i}" for i in transformed.columns]
    onehot_enc_df = onehot_enc_df.join(transformed)
    onehot_enc_df.drop([feat], axis=1, inplace=True)
# Split each encoded frame into features (X) and the SalePrice target (Y)
label_X, label_Y = label_enc_df.drop('SalePrice', axis=1), label_enc_df['SalePrice']
onehot_X, onehot_Y = onehot_enc_df.drop('SalePrice', axis=1), onehot_enc_df['SalePrice']
# First, let's see how our models perform without any scaling
rf, xgb, lgb = evaluate_performance(label_X, label_Y, test_size=0.2)
Model Scores: ------------------------- RandomForestRegressor Score: 0.64 XGBRegressor Score: 0.82 LGBMRegressor Score: 0.83
# List all features we wish to transform (continuous numeric columns only)
feats_to_transform = ['LotFrontage', 'LotArea', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF',
'2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'ExtArea']
# Apply StandardScaler
scaler = StandardScaler()
standard_label_X = label_X.copy()
standard_label_X = scale_data(scaler, standard_label_X, feats_to_transform)
# Visualize scaled data
# plot_histogram(standard_label_X, distline=False)
# Test model performance on new data
standardsc_rf, standardsc_xgb, standardsc_lgb = evaluate_performance(standard_label_X, label_Y, test_size=0.2)
Model Scores: ------------------------- RandomForestRegressor Score: 0.62 XGBRegressor Score: 0.82 LGBMRegressor Score: 0.82
# Apply MinMaxScaler
scaler = MinMaxScaler()
minmax_label_X = label_X.copy()
minmax_label_X = scale_data(scaler, minmax_label_X, feats_to_transform)
# Visualize scaled data
# plot_histogram(minmax_label_X)
# Test model performance on new data
minmaxsc_rf, minmaxsc_xgb, minmaxsc_lgb = evaluate_performance(minmax_label_X, label_Y, test_size=0.2)
Model Scores: ------------------------- RandomForestRegressor Score: 0.63 XGBRegressor Score: 0.8 LGBMRegressor Score: 0.81
# Let's see which of the features are the most important by each model
# StandardScaler Model
# CONSISTENCY FIX: this section plots the StandardScaler-fitted models, so the
# column labels are taken from standard_label_X (the values are identical to
# minmax_label_X.columns, but the original source was misleading).
# RandomForestRegressor
plot_feature_importance(standard_label_X.columns, "RandomForestRegressor Feature Importance", standardsc_rf)
# XGBRegressor
plot_feature_importance(standard_label_X.columns, "XGBRegressor Feature Importance", standardsc_xgb)
# LGBMRegressor
plot_feature_importance(standard_label_X.columns, "LGBMRegressor Feature Importance", standardsc_lgb)
# Let's see which of the features are the most important by each model
# MinMaxScaler Model
# RandomForestRegressor
plot_feature_importance(minmax_label_X.columns, "RandomForestRegressor Feature Importance", minmaxsc_rf)
# XGBRegressor
plot_feature_importance(minmax_label_X.columns, "XGBRegressor Feature Importance", minmaxsc_xgb)
# LGBMRegressor
plot_feature_importance(minmax_label_X.columns, "LGBMRegressor Feature Importance", minmaxsc_lgb)